The idea is to use the inferred dependency graph to compute each of its edge partial covariance. This could be very useful in particular to know the sign of the direct dependency. As two linked nodes are independent from all other nodes conditionnally on the neighborhood of the pair, we can restrict the space of nodes to this neighborhood, and compute the partial covariance conditionnally on it. Technically, this allows to reduce computation complexity by inversing smaller matrices. Moreover,tThese inverses should be of low dimension as we aim for sparse graphs.
setwd("/Users/raphaellemomal/these/R/codes")
source("/Users/raphaellemomal/these/R/codes/F_inference.R")
library(tidyverse)
library(ggraph)
library(tidygraph)
library(mvtnorm)
library(PLNmodels)
library(parallel)
library(grid)
library(gridExtra)
Toy network:
Graph1_30 <- readRDS("/Users/raphaellemomal/simulations/Simu/PLN.2.0/erdos/d/Sets_param/Graph1_30.rds")
graph=as_tbl_graph(Graph1_30$omega)
graph %>%
ggraph()+
geom_edge_link()+
geom_node_point()+
geom_node_text(label = 1:30, color="red", size=5)+theme_graph()
## Using `nicely` as default layout
Data and network inferences:
Inference is done with and without resampling.
n=200
p=30
Sigma=Graph1_30$sigma
Y=generator_PLN(Sigma,covariates=NULL,n=n)[[1]]
infEM=EMtree(PLNobject = PLN(Y~1))
##
## Initialization...
## Adjusting a PLN model with full covariance model
## Post-treatments...
## DONE!
## [1] 0.405
##
## Likelihoods: -10.24894 , -10.1945 , -10.19249 , -10.19103 , -10.18902 , -10.18613 , -10.18467 , -10.18267 , -10.18156 , -10.18156 ,
## Convergence took 1.35 secs and 10 iterations.
## Likelihood difference = 1.024035e-06
## Betas difference = 1.411848e-15
resampl_inf=ResampleEMtree(Y,cores = 3,S = 50, maxIter=40)
the following functions allow to get the partial covariances between two variables of a given dependency graph. This graph is described with an adjacency matrix ; here is tested the matrix of conditional probabilities computed by EMtree and the matrix of edges selection frequency computed with ResampleEMtree.
# @get_partialCov
# input : a particular edge in a given network, and a covariance matrix
# output : the partial covariance of this edge conditionnally on its neighborhood
get_partialCov<-function(edge,graph,Sigma){
from=edge[1]
to = edge[2]
p=dim(graph%N>%as_tibble())[1]
neighb= graph %>% activate(nodes) %>%
mutate(name=1:p, neighbor= ( (node_is_adjacent(from) | node_is_adjacent(to)) & !(name %in%c(from,to)) ) ) %>%
select(neighbor) %>% as.tibble()
indexNeighb= which(neighb==TRUE)
v=length(indexNeighb)+2
if(length(indexNeighb)!=0){
bloc1=1:2
bloc2 = 3:v
reduc=c(from,to,indexNeighb)
Sigma_red=Sigma[reduc,reduc]
Sigma_partiel = Sigma_red[bloc1,bloc1] - Sigma_red[bloc1,bloc2]%*%solve(Sigma_red[bloc2,bloc2])%*%Sigma_red[bloc2,bloc1]
partialCov=Sigma_partiel[1,2]
}else{
partialCov=Sigma[from,to]
}
return(partialCov)
}
# @edges_partialCov
# input : an adjacency describing a graph matrix that can be weighted, a covariance matrix
# and a bound to filter the initial weights
# output : a tibble with all edges from the graph, their weight and additional partial covariance
edges_partialCov=function(adjmat,Sigma,lowbound=0.1){
graph=adjmat%>%as_tbl_graph() %>% activate(edges) %>% filter(weight>lowbound)
edges = graph %E>% as_tibble()
list_edges=edges %>% select(from,to) %>% t() %>% as_tibble() %>% as.list()
vec_partialCov=unlist(lapply(list_edges,function(x) get_partialCov(x,graph,Sigma)))
vec_partialCov[which(vec_partialCov>0 & vec_partialCov<1e-15)]=0
edges$partialCov = vec_partialCov
print(summary(vec_partialCov))
return(edges)
}
# @visu_partialCov
# input : an adjacency matrix for a graph, a covariance matrix and a bound to filter edges according to weights
# output : a figure with original graph on the left and infered one on the right, with colors corresponding to the
# sign of the partial covariances that are computed.
# the filter boolean allows to drop edges with 0 partial covariance. This happen when the true Sigma is
# given.
visu_partialCov<-function(bound,Sigma,adjmat, filter=FALSE, title=""){
set.seed(1)
edges=edges_partialCov(adjmat,Sigma,bound)
edges %>% ggplot(aes(weight,partialCov))+geom_point()+theme_bw()
original_Layout<-create_layout(graph,"nicely")
graph_hat=adjmat%>%as_tbl_graph() %>% activate(edges) %>% filter(weight>bound)
g=graph%N>%
mutate(x=original_Layout$x,y=original_Layout$y) %>%
ggraph(layout="auto")+
geom_edge_link()+
geom_node_point()+
geom_node_text(label = 1:30, color="red", size=5)+theme_graph()
graph_hat=graph_hat %E>% mutate(partialCov=edges$partialCov)
if(filter) graph_hat=graph_hat %>% filter(partialCov!=0)
g1= graph_hat %N>% mutate(x=original_Layout$x,y=original_Layout$y) %>%
ggraph(layout="auto")+
geom_edge_link(aes(edge_width=weight, color=as.factor(sign(partialCov))))+
geom_node_point()+theme_graph()+
scale_edge_width_continuous(range=c(0.1,2))+
geom_node_text(label = 1:30, color="blue", size=5)+theme_graph()
grid.arrange(g,g1,ncol=2,nrow=1, top=textGrob(title, gp=gpar(fontsize=18,font=8)))
}
The sign of a partial covariance is the opposite of that of the elements of the precision matrix, due to the inversion (-det*…). Here we start with a precision matrix defined as a binary matrix with ones on true edges, and a loaded diagonal to ensure its positive definitiveness. Therefore all partial covariances must be negative.
Two probability thesholds are looked at : the natural 2/p, and 50%. Frequancies obtained with 2/p are then thresholded at 80%, those obtained with 50% are then tresholded at 5%.
adjmat_freq1=F_Vec2Sym(colMeans(1*(resampl_inf$Pmat>2/p)))
adjmat_freq2=F_Vec2Sym(colMeans(1*(resampl_inf$Pmat>0.3)))
adjmat_freq3=F_Vec2Sym(colMeans(1*(resampl_inf$Pmat>0.5)))
plnY=PLN(Y~1)
##
## Initialization...
## Adjusting a PLN model with full covariance model
## Post-treatments...
## DONE!
visu_partialCov(2/p,plnY$model_par$Sigma, infEM$ProbaCond,FALSE, title="Edge probabilities, 2/p" )
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.20336 -0.11323 -0.06415 -0.03826 0.06025 0.16503
visu_partialCov(0.3,plnY$model_par$Sigma, infEM$ProbaCond,FALSE, title="Edge probabilities, 30%" )
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.2830 -0.1900 -0.1561 -0.1184 -0.0760 0.1890
visu_partialCov(0.8, plnY$model_par$Sigma,adjmat_freq1,FALSE, title="Edge selection frequencies (2/p), 80%")
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.26470 -0.16986 -0.11717 -0.08711 -0.05067 0.18901
visu_partialCov(0.5, plnY$model_par$Sigma,adjmat_freq2,FALSE, title="Edge selection frequencies (30%), 50%")
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.28298 -0.17931 -0.17161 -0.12095 -0.09312 0.18901
visu_partialCov(0.05, plnY$model_par$Sigma,adjmat_freq3,FALSE, title="Edge selection frequencies (50%), 5%")
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.26742 -0.14949 -0.09530 -0.06593 0.05856 0.16582
We want less positive partial covariance in proportion after the threshold than before it.
partialCov_freq1=edges_partialCov(adjmat_freq1,plnY$model_par$Sigma,0)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.179186 -0.052406 -0.004716 -0.009795 0.036766 0.121685
partialCov_freq2=edges_partialCov(adjmat_freq2,plnY$model_par$Sigma,0)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.20520 -0.10466 -0.06266 -0.03941 0.04606 0.14490
partialCov_freq3=edges_partialCov(adjmat_freq3,plnY$model_par$Sigma,0)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.24492 -0.12750 -0.07334 -0.04968 0.05272 0.15412
partialCov_prob=edges_partialCov( infEM$ProbaCond,plnY$model_par$Sigma,0)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.177781 -0.036460 -0.000493 -0.005956 0.026390 0.120884
partialCov_prob %>% ggplot(aes(weight,partialCov))+geom_point()+geom_hline(yintercept=0,color="red")+
geom_vline(xintercept=2/30, col="blue")+labs(title="edges conditional probabilities")+theme_bw()
partialCov_freq1 %>% ggplot(aes(weight,partialCov))+geom_point()+geom_hline(yintercept=0,color="red")+
geom_vline(xintercept=0.8, col="blue")+labs(title="edges selection frequencies (2/p)")+theme_bw()
partialCov_freq2 %>% ggplot(aes(weight,partialCov))+geom_point()+geom_hline(yintercept=0,color="red")+
geom_vline(xintercept=0.5, col="blue")+labs(title="edges selection frequencies (30%)")+theme_bw()
partialCov_freq3 %>% ggplot(aes(weight,partialCov))+geom_point()+geom_hline(yintercept=0,color="red")+
geom_vline(xintercept=0.05, col="blue")+labs(title="edges selection frequencies (50%)")+theme_bw()
positiv_prop<-function(data,xint, title){
list_prop=lapply(seq(0,1,0.01), function(thresh){
n_lower=length(which(data$weight<thresh))
n_upper=length(which(data$weight>thresh))
prop_low=length(which(data$partialCov>0 & data$weight<thresh))/n_lower
prop_up=length(which(data$partialCov>0 & data$weight>thresh))/n_upper
return(c(prop_low,prop_up))
})
props=as_tibble(do.call(rbind,list_prop)[-c(1,101),]);colnames(props)=c("prop_before","prop_after")
props$thresh=seq(0,1,0.01)[-c(1,101)]
props %>% gather(key,value,-thresh) %>%
ggplot(aes(thresh,value,color=key))+geom_point()+geom_line()+theme_bw()+
geom_vline(xintercept = xint, linetype="dashed")+
labs(title=title, x="threshold",y="%")
}
positiv_prop(partialCov_prob,2/p,"edge probabilities")
positiv_prop(partialCov_freq1,0.5, "edge selection frequencies (2/p)")
positiv_prop(partialCov_freq2,0.5,"edge selection frequencies (30%)")
positiv_prop(partialCov_freq3,0.05,"edge selection frequencies (50%)")
original_edges=graph%E>%as_tibble()
good_pred<-function(data,xint, title){
join=full_join(data,original_edges, by=c("from","to")) %>%
mutate(original = !is.na(weight.y))
join %>% ggplot(aes(original,weight.x))+geom_violin()+geom_hline(yintercept=2/30)+theme_bw()
list_prop=lapply(seq(0,1,0.01), function(thresh){
n_lower=length(which(join$weight.x<thresh))
n_upper=length(which(join$weight.x>thresh))
low_true=length(which(join$original==TRUE & join$weight.x<thresh))/n_lower
high_false=length(which(join$original==FALSE & join$weight.x>thresh))/n_upper
return(c(low_true,high_false))
})
props=as_tibble(do.call(rbind,list_prop)[-c(1,101),]);colnames(props)=c("FN","FP")
props$thresh=seq(0,1,0.01)[-c(1,101)]
props %>% gather(error,value,-thresh) %>%
ggplot(aes(thresh,value,color=error))+geom_point()+geom_line()+theme_bw()+
geom_vline(xintercept = xint, linetype="dashed")+geom_hline(yintercept = 0.1)+labs(title = title)
}
good_pred(partialCov_prob,2/30,"edge probabilities")
good_pred(partialCov_freq1,0.9, "edge selection frequency (2/p)")
good_pred(partialCov_freq2,0.7, "edge selection frequency (30%)")
good_pred(partialCov_freq3,0.4, "edge selection frequency (50%)")